Load data

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.0 ──
## ✔ broom        1.0.5     ✔ rsample      1.1.1
## ✔ dials        1.2.0     ✔ tibble       3.2.1
## ✔ dplyr        1.1.2     ✔ tidyr        1.3.0
## ✔ infer        1.0.4     ✔ tune         1.1.1
## ✔ modeldata    1.1.0     ✔ workflows    1.1.3
## ✔ parsnip      1.1.0     ✔ workflowsets 1.0.1
## ✔ purrr        1.0.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.6
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

Set parameter ranges based on value combinations used in experiments.

# Vectors of all possible conditions combinations
frs <- unique(CC.TotalData$Flow.rate)
chls <- unique(CC.TotalData$Chlorophyll)
guans <- unique(CC.TotalData$Guano)
lights <- unique(CC.TotalData$Light)
#frs <- as.numeric(as.character(unique(CC.TotalData$Flow.rate)))
#chls <- as.numeric(as.character(unique(CC.TotalData$Chlorophyll)))
#guans <- as.numeric(as.character(unique(CC.TotalData$Guano)))
#lights <- as.numeric(as.character(unique(CC.TotalData$Light)))
#guans <- c(1,2)
#lights <- c(1,2)
conditions <- expand.grid(frs,chls,guans,lights)

Loop through combinations

for (i in 1:dim(conditions)[1])
{
  velocity <- CC.TotalData$smooth.v[
  (CC.TotalData$Flow.rate==conditions[i,1] & 
     CC.TotalData$Chlorophyll==conditions[i,2] & 
     as.numeric(CC.TotalData$Guano)==conditions[i,3] &
     as.numeric(CC.TotalData$Light)==conditions[i,4])]
  if (length(velocity) <= 1 | length(velocity[!is.na(velocity)])==0) {
    conditions[i,5:9] <- NA } else {
        # Because we're using the smoothed data, we'll average the velocities into 30-step bins
      idx <- 1:length(velocity)
      bx <- seq(from=0.5, to=length(velocity), by=30)
      velocity <- binMeans(y = velocity, x = idx, bx = bx)
      
      c<-cor.test(log10(velocity[1:length(velocity)-1]),
                  log10(velocity[2:length(velocity)]))
      conditions[i,5] <- c$estimate
        v0 <- log10(velocity[1:length(velocity)-1])
        v1 <- log10(velocity[2:length(velocity)])
        df <- data.frame(v0,v1)
        fit <- lm(v1 ~ v0 + 1, data = df)
        conditions[i,6] <- fit$coefficients[2]
        conditions[i,7] <- fit$coefficients[1]
        conditions[i,8] <- sd(fit$residuals)
        d <- dip.test(velocity)
        conditions[i,9] <- d$p.value
        hist(log10(velocity),100,
             main=d$p.value)
  }  
}

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

conditions <- setNames(conditions,c("flow.rate","chlorophyll","guano","light","corr.val","slope","intercept","sigma","dip.test"))

Using random forest to fit the missing values for autocorrelation

Idata <- which(!is.na(conditions[,5])) # Index of data
Iblank <- which(is.na(conditions[,5])) # Index of blank data

conditions_data <- conditions[Idata,]
conditions_blank <- conditions[Iblank,]

conditions.rf.slope <- randomForest(slope ~ .,
                              data=select(conditions_data,c(1,2,3,4,6)),
                              importance=TRUE,
                              proximity=TRUE)
plot(conditions.rf.slope)

varImpPlot(conditions.rf.slope)

conditions_blank$slope <- predict(conditions.rf.slope,conditions_blank[,1:4])
fit.slope <- predict(conditions.rf.slope,conditions_data[,1:4])
plot(fit.slope,conditions_data$slope)

plot(conditions_blank$chlorophyll,conditions_blank$slope)

plot(conditions_blank$flow.rate,conditions_blank$slope)

plot(conditions_blank$light,conditions_blank$slope)

plot(conditions_blank$guano,conditions_blank$slope)

conditions.rf.intercept <- randomForest(intercept ~ .,
                              data=select(conditions_data,c(1,2,3,4,7)),
                              importance=TRUE,
                              proximity=TRUE)
plot(conditions.rf.intercept)

varImpPlot(conditions.rf.intercept)

conditions_blank$intercept <- predict(conditions.rf.intercept,conditions_blank[,1:4])
fit.intercept <- predict(conditions.rf.intercept,conditions_data[,1:4])
plot(fit.intercept,conditions_data$intercept)

plot(conditions_blank$chlorophyll,conditions_blank$intercept)

plot(conditions_blank$flow.rate,conditions_blank$intercept)

plot(conditions_blank$light,conditions_blank$intercept)

plot(conditions_blank$guano,conditions_blank$intercept)

conditions.rf.sigma <- randomForest(sigma ~ .,
                              data=select(conditions_data,c(1,2,3,4,8)),
                              importance=TRUE,
                              proximity=TRUE)
plot(conditions.rf.sigma)

varImpPlot(conditions.rf.sigma)

conditions_blank$sigma <- predict(conditions.rf.sigma,conditions_blank[,1:4])
fit.sigma <- predict(conditions.rf.sigma,conditions_data[,1:4])
plot(fit.sigma,conditions_data$sigma)

plot(conditions_blank$chlorophyll,conditions_blank$sigma)

plot(conditions_blank$flow.rate,conditions_blank$sigma)

plot(conditions_blank$light,conditions_blank$sigma)

plot(conditions_blank$guano,conditions_blank$sigma)

Save fit models

save(conditions.rf.slope,conditions.rf.intercept,conditions.rf.sigma, file='notebook13-rf-2023.10.24data.RData')